import jieba


def find_data(txt, name):
    """
    Filter reviews by product name.

    txt: review text
    name: keyword identifying the target product
    Returns txt if it mentions the keyword, otherwise None.
    """
    # Original had a dead `else: pass` branch that fell through to an
    # implicit None; make the None return explicit.
    return txt if name in txt else None


def data_process(txt):
    """
    Drop useless reviews and clean up HTML ellipsis entities.

    txt: review text
    Returns the cleaned text, or None if the review is a platform
    placeholder, a templated review, or too short to be meaningful.
    """
    # Platform-generated placeholder for an empty review.
    if txt == '此用户没有填写评论!':
        return None
    # Canned "good review" template — carries no signal.
    if '好评模板' in txt:
        return None
    # BUG FIX: strip the '&hellip' entity BEFORE the length check.
    # In the original, this branch sat after `len(txt) >= 5` and was
    # unreachable ('&hellip' alone is 7 characters), so the entity was
    # never removed from kept reviews.
    txt = txt.replace('&hellip', '')
    # Reviews shorter than 5 characters are treated as noise.
    return txt if len(txt) >= 5 else None


def distinct_(st):
    """
    Mechanically de-duplicate consecutively repeated substrings in the
    text, e.g. '很好很好很好' collapses to '很好'.

    st: input string
    Returns the string with each run of adjacent repeats reduced to a
    single copy.
    """

    # i is the length of the candidate repeating unit. A repeat of unit
    # length i needs at least 2*i characters, hence len(st)/2 as the cap.
    for i in range(1, int(len(st) / 2) + 1):
        # NOTE(review): the range is computed from the ORIGINAL length and
        # st shrinks inside the loop, so j can run past the new end. This
        # is harmless: out-of-range slices are empty strings, the equality
        # then compares '' == '' and the rebuild below is a no-op.
        for j in range(len(st)):
            # The i-length unit starting at j is immediately repeated.
            if st[j:j + i] == st[j + i:j + 2 * i]:
                k = j + i
                # Advance k over every additional consecutive copy of the
                # unit; k ends at the start of the last copy in the run.
                while st[k:k + i] == st[k + i:k + 2 * i] and k < len(st):
                    k = k + i
                # Splice out everything between j and k, keeping exactly
                # one copy of the unit (the one beginning at k).
                st = st[:j] + st[k:]
    return st


def get_custom_stopwords(stop_words_file):
    """
    Load the HIT (Harbin Institute of Technology) stopword list.

    stop_words_file: path to a UTF-8 text file, one stopword per line.
    Returns the stopwords as a list of strings.
    """
    with open(stop_words_file, encoding='utf-8') as f:
        # split('\n') (rather than splitlines) preserves the original
        # behavior of yielding a trailing '' entry when the file ends
        # with a newline. The original's `[i for i in stopwords_list]`
        # was a pointless identity copy and has been removed.
        return f.read().split('\n')


def chinese_word_cut(mytext, stop_words_file=r'G:\毕业设计\情感分析\corpus\hit_stopwords.txt'):
    """
    Segment Chinese text with jieba and remove stopwords.

    mytext: text to segment
    stop_words_file: path to the stopword list. Defaults to the path the
        original hard-coded, so existing callers are unaffected; new
        callers can supply their own list.
    Returns the surviving tokens joined by single spaces.
    """
    # A set makes the per-token membership test O(1) instead of scanning
    # the whole stopword list for every word.
    stopwords = set(get_custom_stopwords(stop_words_file))
    return ' '.join(word for word in jieba.lcut(mytext) if word not in stopwords)
